import_projection <- function(proj_dir, proj_name){
  # Import a 2D projection -- this is used for visualization
  proj_coords <- read.csv(paste(proj_dir, proj_name, sep="/"), header=F, sep=" ")
  colnames(proj_coords) = c("x1","x2")
  
  return(proj_coords)
}

import_clusters <- function(clust_dir, clust_name){
  # Import cluster labels.
  clust_labels <- read.csv(paste(clust_dir, clust_name, sep="/"), header=F)
  colnames(clust_labels) <- "cluster"
  clust_labels$cluster <- factor(clust_labels$cluster)

  return(clust_labels)
}

import_aux <- function(aux_dir, aux_file){
  # Import auxiliary data (in this case, population labels)
  samples <- read.csv(paste(aux_dir, aux_file, sep="/"), sep="\t")
  samples$sample <- as.character(samples$sample)
  samples <- samples[,-c(3)]
  
  return(samples)
}

import_descriptions <- function(aux_dir, aux_file){
  # Import population descriptions by population code
  temp <- read.csv(paste(aux_dir, aux_file, sep="/"), sep="\t")
  # we only want the two relevant columns
  # last row is "Total"
  temp <- temp[1:nrow(temp)-1,c(1,2)]
  temp$Population.Description <- as.character(temp$Population.Description)
  
  return(temp)
}

import_ids <- function(id_dir, id_file){
  # Import the 1000GP IDs as ordered in the original VCF
  ids <- read.csv(paste(id_dir, id_file, sep="/"), sep = " ", header=F)
  
  return(ids)
}

get_cluster_means <- function(in_data, dim1, dim2, cluster_var){
  # Calculate the mean coordinate of a population variable (e.g. cluster)
  cluster_coords <- in_data[,which(colnames(in_data) %in% c(cluster_var, dim1, dim2))]

  coord_means_1 <- cluster_coords %>%
    group_by(!! sym(cluster_var)) %>%
    summarize(x1 = mean(!! sym(dim1)))
  
  coord_means_2 <- cluster_coords %>%
    group_by(!! sym(cluster_var)) %>%
    summarize(x2 = mean(!! sym(dim2)))
  
  coord_means <- left_join(coord_means_1, coord_means_2, by = c(cluster_var))
  
  return(coord_means)
}
proj_dir <-"data"
proj_name <- "1000G_UMAP_PC100_NC2_NN15_MD0.5_admixturenotebook"

cluster_dir <- "data"

aux_dir <- "data/"
aux_file <- "affy_samples.20141118.panel"

label_file <- "20131219.populations.tsv"
id_file <- "1KGP_ids.txt"

coords <- import_projection(proj_dir, proj_name)

samples <- import_aux(aux_dir, aux_file)
descriptions <- import_descriptions(aux_dir, label_file)
ids <- import_ids(aux_dir, id_file)
s <- 0.1
a <- 0.4

f <- "hdbscan_labels_min25_EPS0.5_1000G_UMAP_PC16_NC5_NN50_MD0.01_euclidean_2019814225811"

clusters <- import_clusters(cluster_dir, f)

main_data <- data.frame(cbind(ids, coords))
colnames(main_data)[1] <- "ID"
main_data$ID <- as.character(main_data$ID)

main_data <- main_data[,-c(2)]

# add population labels
main_data <- merge(main_data, samples, by.x = "ID", by.y = "sample")

# add cluster labels
main_data$cluster <- clusters$cluster

pop_means <- get_cluster_means(main_data, "x1", "x2", "pop")

# Note that in this 2D visualization, the ITU and GIH populations  are split into multiple clusters
pop_means <- subset(pop_means, !(pop %in% c("ITU","GIH")))
pop_means <- rbind.data.frame(pop_means, list("ITU",-7,5))
pop_means <- rbind.data.frame(pop_means, list("ITU",15,16))
pop_means <- rbind.data.frame(pop_means, list("GIH",-7,5))
pop_means <- rbind.data.frame(pop_means, list("GIH",2,15))

ggplot(main_data, aes(x=x1, y=x2, colour=pop)) +
  geom_point(size=s, alpha=a) +
  scale_color_manual(name = "pop", values=pal) +
  ggtitle("Thousand Genomes, coloured by population") +
  xlab("UMAP1") + ylab("UMAP2") + theme_bw() + 
  theme(legend.position = "none") + 
  geom_label_repel(data = pop_means,
                   aes(x=x1, y=x2, label=pop, colour = as.factor(pop)), size=5, alpha=0.6, label.size=0.05)

# In this visualization, cluster 16 (mostly ITU individuals) is split into multiple clusters
cluster_means <- get_cluster_means(main_data, "x1", "x2", "cluster")

cluster_means <- subset(cluster_means, cluster!=16)
cluster_means_extra <- cluster_means[FALSE,]
cluster_means_extra <- rbind.data.frame(cluster_means_extra, data.frame(cluster="16",x1=-4.3,x2=5))
cluster_means_extra <- rbind.data.frame(cluster_means_extra, data.frame(cluster="16",x1=15,x2=14))

# Colour palette
colourCount <- length(unique(main_data$cluster))
getPalette = colorRampPalette(brewer.pal(9, "Set1"))
colpal <- colorRampPalette(brewer.pal(8, "Set1"))(colourCount)

# manually change some colours:
# 14 is too yellow
colpal[15] <- "#FFBC2D"

# randomly permute the colours for clarity.
# sequential clusters are often (1) next to each other and (2) similarly coloured.
colpal <- sample(colpal)

# Some colours too similar:
# 11 and 12, 9 and 10, 
ggplot(main_data, aes(x=x1, y=x2, colour=cluster)) +#, shape=cluster)) +
        geom_point(size=s, alpha=a) +
  scale_color_manual(values = colpal) +
  geom_label_repel(data = cluster_means,
                   aes(x=x1, y=x2, label=cluster, colour = as.factor(cluster)), size=5, alpha=0.6) +
  geom_label(data = cluster_means_extra,
             aes(x=x1, y=x2, label=cluster, colour = as.factor(cluster)), size=5, alpha=0.6) +
  ggtitle("Thousand Genomes, coloured by clustering algorithm") +
  xlab("UMAP1") + ylab("UMAP2") + theme_bw() +
  theme(legend.position = "none")

Interactive plots

s <- 2
  
main_data <- data.frame(cbind(ids, coords))
colnames(main_data)[1] <- "ID"
main_data$ID <- as.character(main_data$ID)

main_data <- main_data[,-c(2)]

# add population labels
main_data <- merge(main_data, samples, by.x = "ID", by.y = "sample")

# add cluster labels
main_data$cluster <- clusters$cluster

# unassigned boolean
# The clustering for this demo has been selected to not have any unclustered individuals
main_data$unassigned <- ifelse(main_data$cluster==-1,"unassigned","assigned")

colourCount <- length(unique(main_data$cluster))
getPalette = colorRampPalette(brewer.pal(9, "Set1"))

fig_main <- plot_ly(data = main_data, x = ~x1, y = ~x2,
                    marker = list(size=s)) %>%
  add_markers(color = ~pop, colors = pal, 
              text = ~paste( "ID", ID, "\nCluster", cluster),
              type = "scattergl", mode = "markers")

fig_main
fig_clusters <- plot_ly(data = main_data, x = ~x1, y = ~x2,
                        marker = list(size=s)) %>%
  add_markers(color = ~cluster, colors = getPalette(colourCount), 
             text = ~paste( "ID", ID, "\nPopulation", pop),
             type = "scattergl", mode = "markers")


fig_clusters
print_pop_table <- function(p, in_data){
  # Print the cluster membership of a specified population
  cat(paste(p, "below"))
  temp_data <- in_data %>% filter(pop==p)
  print(table(temp_data$cluster))
  cat("Proportion of population in largest cluster:")
  print(max(table(temp_data$cluster))/sum(table(temp_data$cluster)))
  cat("\n")
}

for (p in unique(main_data$pop)){
  print_pop_table(p, main_data)
}
## GBR below
##   0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17 
##   0   0   0   0 105   0   0   0   0   0   0   0   0   0   0   0   0   0 
##  18  19  20 
##   0   0   0 
## Proportion of population in largest cluster:[1] 1
## 
## FIN below
##   0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17 
## 104   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0 
##  18  19  20 
##   0   0   0 
## Proportion of population in largest cluster:[1] 1
## 
## CHS below
##   0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17 
##   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0 171 
##  18  19  20 
##   0   0   0 
## Proportion of population in largest cluster:[1] 1
## 
## PUR below
##   0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17 
##   0   0 145   0   0   0   4   0   0   0   0   0   0   0   0   0   0   0 
##  18  19  20 
##   0   0   0 
## Proportion of population in largest cluster:[1] 0.9731544
## 
## CDX below
##   0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17 
##   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   1 
##  18  19  20 
## 104   0   0 
## Proportion of population in largest cluster:[1] 0.9904762
## 
## CLM below
##   0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17 
##   0   0   0   0   0   0   0   0   0   0   0   4 142   0   0   0   0   0 
##  18  19  20 
##   0   0   0 
## Proportion of population in largest cluster:[1] 0.9726027
## 
## IBS below
##   0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17 
##   0   0   0 162   0   0   0   0   0   0   0   0   0   0   0   0   0   0 
##  18  19  20 
##   0   0   0 
## Proportion of population in largest cluster:[1] 1
## 
## PEL below
##   0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17 
##   0   0   0   0   0   0   0 128   0   0   0   0   1   0   0   0   0   0 
##  18  19  20 
##   0   0   0 
## Proportion of population in largest cluster:[1] 0.9922481
## 
## PJL below
##  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 
##  0  0  0  0  0  0  0  0  0  0  0  0  0  0 48 95 12  0  0  0  0 
## Proportion of population in largest cluster:[1] 0.6129032
## 
## KHV below
##   0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17 
##   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   3 
##  18  19  20 
## 118   0   0 
## Proportion of population in largest cluster:[1] 0.9752066
## 
## ACB below
##   0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17 
##   0   0   0   0   0   0 122   0   0   0   0   0   0   0   0   0   0   0 
##  18  19  20 
##   0   0   0 
## Proportion of population in largest cluster:[1] 1
## 
## GWD below
##   0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17 
##   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0 
##  18  19  20 
##   0 179   1 
## Proportion of population in largest cluster:[1] 0.9944444
## 
## ESN below
##   0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17 
##   0   0   0   0   0   0   0   0   0 172   0   0   0   0   0   0   0   0 
##  18  19  20 
##   0   0   0 
## Proportion of population in largest cluster:[1] 1
## 
## BEB below
##   0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17 
##   0   0   0   0   0   0   0   0 133   0   0   0   0   0   0   0  10   0 
##  18  19  20 
##   0   0   0 
## Proportion of population in largest cluster:[1] 0.9300699
## 
## MSL below
##   0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17 
##   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0 
##  18  19  20 
##   0   0 122 
## Proportion of population in largest cluster:[1] 1
## 
## STU below
##   0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17 
##   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   4 124   0 
##  18  19  20 
##   0   0   0 
## Proportion of population in largest cluster:[1] 0.96875
## 
## ITU below
##   0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17 
##   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   9 109   0 
##  18  19  20 
##   0   0   0 
## Proportion of population in largest cluster:[1] 0.9237288
## 
## CEU below
##   0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17 
##   0   0   0   0 183   0   0   0   0   0   0   0   0   0   0   0   0   0 
##  18  19  20 
##   0   0   0 
## Proportion of population in largest cluster:[1] 1
## 
## YRI below
##   0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17 
##   0   0   0   0   0   0   0   0   0   1 181   0   0   0   0   0   0   0 
##  18  19  20 
##   0   0   0 
## Proportion of population in largest cluster:[1] 0.9945055
## 
## CHB below
##   0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17 
##   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0 105 
##  18  19  20 
##   0   0   0 
## Proportion of population in largest cluster:[1] 1
## 
## JPT below
##   0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17 
##   0   0   0   0   0   0   0   0   0   0   0   0   0 104   0   0   0   1 
##  18  19  20 
##   0   0   0 
## Proportion of population in largest cluster:[1] 0.9904762
## 
## LWK below
##   0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17 
##   0 110   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0 
##  18  19  20 
##   0   0   0 
## Proportion of population in largest cluster:[1] 1
## 
## ASW below
##   0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17 
##   0   0   0   0   0   0 103   0   0   0   0   1   3   0   0   0   0   0 
##  18  19  20 
##   0   0   0 
## Proportion of population in largest cluster:[1] 0.9626168
## 
## MXL below
##  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 
##  0  0  0  7  0  0  0  0  0  0  0 97  0  0  0  0  0  0  0  0  0 
## Proportion of population in largest cluster:[1] 0.9326923
## 
## TSI below
##   0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17 
##   0   0   0 111   0   0   0   0   0   0   0   0   0   0   0   0   0   0 
##  18  19  20 
##   0   0   0 
## Proportion of population in largest cluster:[1] 1
## 
## GIH below
##  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 
##  0  0  0  0  0 69  0  0  0  0  0  0  0  0  0 41  1  0  0  0  0 
## Proportion of population in largest cluster:[1] 0.6216216